# Decision trees for predicting whether a Beatles song made the Billboard Top 50.
# Load plotting dependencies and draw the first tree.
# Template reminders (kept from the course handout):
# library(RColorBrewer)
# fancyRpartPlot(<decision tree>)
library(rpart)
library(rattle)
library(rpart.plot)
library(RColorBrewer)
# NOTE(review): top.50.tree.1 must already exist in the session; it is (re)built
# later in this file via rpart(Top.50.BB ~ Single.certification + Covered.by + Year).
fancyRpartPlot(top.50.tree.1)
# Make predictions:
# <predictions> <- predict(object = <decision tree>,
#                          newdata = <test dataset>,
#                          type = "class")
# <predictions>[<i1>:<ik>]                            # examine some of the predictions
# <predictions dataframe> <-
#       data.frame(<observation ID> = <test dataset>$<observation ID column>)
# Predict Top.50.BB classes on the test set with the first tree:
top.50.predictions.1 <- predict(top.50.tree.1, newdata = test.data, type = "class")
top.50.predictions.1[1:20]
# Collect song titles, the true labels and the predictions side by side:
top.50.predictions.1.dataframe <- data.frame(Song = test.data$Title,
Top.50.Billboard = test.data$Top.50.Billboard,
Top.50.BB = test.data$Top.50.BB,
Prediction = top.50.predictions.1)
# How good are the predictions?
# Compute confusion matrix:
# <cm> <- table(True = <test dataset>$<output variable>,
#               Predicted = <test dataset>$<predictions>)
# Compute evaluation metrics:
# accuracy = (TP + TN) / N
# precision = TP / (TP + FP)
# recall = TP / (TP + FN)
# F1 = (2 * precision * recall) / (precision + recall)
# Note: precision and recall are inversely proportional to each other.
# <evaluation metrics vector> <- <user-specified function>(<cm>)
#   accuracy = sum(diag(cm)) / sum(cm)
#   precision <- TP / (TP + FP)
#   recall <- TP / (TP + FN)
#   F1 <- (2 * precision * recall) / (precision + recall)
# Confusion matrix for tree 1 (rows = true classes, columns = predictions):
cm.1 <- table(True = test.data$Top.50.BB, Predicted = top.50.predictions.1)
cm.1
# alternatively:
# cm.1 <- table(True = top.50.predictions.1.dataframe$Top.50.BB,
#               Predicted = top.50.predictions.1.dataframe$Prediction)
# cm.1
source("Evaluation metrics.R")
# NOTE(review): later sections call getEvaluationMetrics() for the same purpose;
# only one spelling can be defined by "Evaluation metrics.R" — confirm which.
eval.1 <- get.evaluation.metrics(cm.1)
eval.1
# Try another tree, using fewer and/or different predictors
# (e.g., Duration + Covered.by, Duration + Year, etc.).
# In practice, strong predictors from the previous tree are kept.
# Also try changing the seed in splitting the dataset into
# train and test sets.
# Tree 2: only two predictors (Duration + Covered.by):
top.50.tree.2 <- rpart(Top.50.BB ~ Duration + Covered.by,
data = train.data,
method = "class")
# top.50.tree.2 <- rpart(Top.50.BB ~ .,       # use almost all variables, excluding some specific ones
#                        data = subset(train.data, select = -c(Title, Top.50.Billboard)),
#                        method = "class")
print(top.50.tree.2)
fancyRpartPlot(top.50.tree.2)
# Same predict / confusion-matrix / metrics pipeline as for tree 1:
top.50.predictions.2 <- predict(top.50.tree.2, newdata = test.data, type = "class")
top.50.predictions.2[1:20]
top.50.predictions.2.dataframe <- data.frame(Song = test.data$Title,
Top.50.Billboard = test.data$Top.50.Billboard,
Top.50.BB = test.data$Top.50.BB,
Prediction = top.50.predictions.2)
cm.2 <- table(True = test.data$Top.50.BB, Predicted = top.50.predictions.2)
cm.2
eval.2 <- get.evaluation.metrics(cm.2)
eval.2
# Controlling rpart parameters:
# cp (complexity parameter) - don't split at a node if the split does not improve the model by at least cp (default: 0.01)
# minsplit - don't attempt a split at a node if the number of observations is not higher than minsplit (default: 20)
# install.packages("rpart")
# library(rpart)
# <decision tree> <- rpart(<output variable> ~                                     # build the tree
#                          <predictor variable 1> + <predictor variable 2> + ...,  # . to include all variables
#                          data = <train dataset>,
#                          method = "class",                                       # build classification tree
#                          control = rpart.control(minsplit = <n>, cp = <q>))      # decrease both for larger tree
# print(<decision tree>)                                                           # default textual representation
# Tree 3: same predictors as tree 1, but looser stopping rules
# (minsplit = 10, cp = 0.001) to deliberately grow a larger tree:
top.50.tree.3 <- rpart(Top.50.BB ~ Single.certification + Covered.by + Year,
data = train.data,
method = "class",
control = rpart.control(minsplit = 10, cp = 0.001))
print(top.50.tree.3)
fancyRpartPlot(top.50.tree.3)
top.50.predictions.3 <- predict(top.50.tree.3, newdata = test.data, type = "class")
top.50.predictions.3[1:20]
top.50.predictions.3.dataframe <- data.frame(Song = test.data$Title,
Top.50.Billboard = test.data$Top.50.Billboard,
Top.50.BB = test.data$Top.50.BB,
Prediction = top.50.predictions.3)
cm.3 <- table(True = test.data$Top.50.BB, Predicted = top.50.predictions.3)
cm.3
eval.3 <- get.evaluation.metrics(cm.3)
eval.3
# Compare the results (the corresponding models/trees):
# data.frame(rbind(<evaluation metrics 1>, <evaluation metrics 2>),
#            row.names = c("<tree 1>", "<tree 2>"))
# One labelled row of evaluation metrics per tree:
print(data.frame(rbind(eval.1, eval.3),
                 row.names = c("top.50.tree.1", "top.50.tree.3")))
# Model 3 exhibits overfitting. It is a frequent case with large trees.
# Cross-validate the model - find the optimal value for cp (the most important parameter),
# in order to avoid overfitting the model to the training data:
# install.packages("e1071")                                     # relevant caret functions need e1071
# install.packages("caret")
# library(e1071)
# library(caret)
# <folds> = trainControl(method = "cv", number = <k>)           # define <k>-fold cross-validation parameters
# <cpGrid> = expand.grid(.cp =                                  # specify the range of the cp values to examine
#                        seq(from = <start value>, to = <end value>, by = <step>))
# set.seed(<seed>)
# train(<output variable> ~                                     # find the optimal value for cp
#       <predictor variable 1> + <predictor variable 2> + ...,  # . to include all variables
#       data = <train dataset>,
#       method = "rpart",                                       # use rpart() to build multiple classification trees
#       control = rpart.control(minsplit = 10),
#       trControl = <folds>, tuneGrid = <cpGrid>)               # <folds> and <cpGrid> from above
library(e1071)
library(caret)
folds = trainControl(method = "cv", number = 10)                # 10-fold cross-validation
cpGrid = expand.grid(.cp = seq(from = 0.001, to = 0.05, by = 0.001))
set.seed(11)
# NOTE(review): the train() result is not assigned; the chosen cp is read off
# the console output and typed by hand into the prune() call that follows.
train(Top.50.BB ~ Single.certification + Covered.by + Year,
data = train.data,
method = "rpart",
control = rpart.control(minsplit = 10),
trControl = folds, tuneGrid = cpGrid)
# Prune tree 3 at the cp value suggested by the cross-validation above:
top.50.tree.5 <- prune(top.50.tree.3, cp = 0.013)    # cp value found in the previous step (train())
print(top.50.tree.5)
fancyRpartPlot(top.50.tree.5)
# Make predictions with the pruned tree:
top.50.predictions.5 <- predict(top.50.tree.5, newdata = test.data, type = "class")
top.50.predictions.5[1:20]
top.50.predictions.5.dataframe <- data.frame(Song = test.data$Title,
Top.50.Billboard = test.data$Top.50.Billboard,
Top.50.BB = test.data$Top.50.BB,
Prediction = top.50.predictions.5)
cm.5 <- table(True = test.data$Top.50.BB, Predicted = top.50.predictions.5)
cm.5
eval.5 <- get.evaluation.metrics(cm.5)
eval.5
# Compare all relevant models:
# data.frame(rbind(<evaluation metrics 1>, <evaluation metrics 2>, ...),
#            row.names = c("<tree 1>", "<tree 2>", ...))
# Stack the three metric vectors as rows, labelled by tree name:
print(data.frame(rbind(eval.1, eval.3, eval.5),
                 row.names = c("top.50.tree.1",
                               "top.50.tree.3",
                               "top.50.tree.5")))
#######
# KNN #
#######
# Reading the dataset
# Restoring the dataset from the corresponding RData file:
# <dataframe or another R object> <- readRDS(file = "<filename>")         # restore R object in the next session
# The Beatles songs dataset has been saved earlier using:
# saveRDS(object = the.beatles.songs, file = "The Beatles songs dataset, v3.4.RData")
the.beatles.songs <- readRDS("The Beatles songs dataset, v3.4.RData")
# Rescaling of numeric variables needed?
# summary(<dataframe>)                    # examine the ranges of numeric variables
summary(the.beatles.songs)
# Check if numeric variables follow normal distribution:
# summary(<numeric variable>)           # the mean and the median values similar: probably normal distribution
# plot(density((<numeric variable>))    # visual inspection
# hist(<numeric variable>)              # visual inspection
# qqnorm(<numeric variable>)            # values lie more or less along the diagonal (straight line)
# shapiro.test(<numeric variable>)      # good for small sample sizes, e.g. n < ~2000; H0: normal distribution
plot(density(the.beatles.songs$Covered.by))
# ...                                   # check the distributions of other variables as well
source("Rescale numeric variables.R")
# the.beatles.songs.rescaled.old <- NormalizeNumericVariables(the.beatles.songs)
# NOTE(review): a later section calls rescaleNumericVariables() (lowercase r) —
# confirm which spelling "Rescale numeric variables.R" actually defines.
the.beatles.songs.rescaled <- RescaleNumericVariables(the.beatles.songs)
# Split the dataset into train and test sets:
# install.packages("caret")
# library(caret)
# set.seed(<n>)
# <train dataset indices> <-                            # stratified partitioning:
#     createDataPartition(<dataset>$<output variable>,  # the same distribution of the output variable in both sets
#                         p = .80,                      # 80/20% of data in train/test sets
#                         list = FALSE)                 # don't make a list of results, make a matrix
# <train dataset> <- <dataset>[<train dataset indices>, ]
# <test dataset>  <- <dataset>[-<train dataset indices>, ]
library(caret)
set.seed(444)
# set.seed(333) - results in a different split, and different results and eval. metrics
train.data.indices <- createDataPartition(the.beatles.songs.rescaled$Top.50.BB, p = 0.80, list = FALSE)
train.data <- the.beatles.songs.rescaled[train.data.indices, ]
test.data <- the.beatles.songs.rescaled[-train.data.indices, ]
# Build the model:
# library(class)
# <knn model> <- knn(train = <training dataset>,        # training data without the output (class) variable
#                    test = <test dataset>,             # test data without the output (class) variable
#                    cl = <class values for training>,  # output (class) variable is specified here
#                    k = <n>)                           # <n>: random guess, or obtained from cross-validation
# head(<knn model>)
library(class)
# Preview the training data, and two equivalent ways of writing the
# negative-index selection that drops columns 1 and 11:
head(train.data)
head(train.data[, c(-1, -11)])
head(train.data[, -c(1, 11)])
# Recompute tree-1 metrics and the tree-1-vs-tree-3 comparison:
source("Evaluation metrics.R")
# NOTE(review): getEvaluationMetrics() here vs get.evaluation.metrics() earlier
# in this file — only one spelling can work; confirm against the sourced script.
eval.1 <- getEvaluationMetrics(cm.1)
eval.1
data.frame(rbind(eval.1, eval.3),
row.names = c("top.50.tree.1", "top.50.tree.3"))
# Discretization experiments on the v3.4 dataset.
the.beatles.songs <- readRDS("The Beatles songs dataset, v3.4.RData")
str(the.beatles.songs)
apply(the.beatles.songs[, c(3:4, 7:10)], MARGIN = 2, FUN = shapiro.test)   # no normally distributed numeric vars
library(bnlearn)
?discretize
View(the.beatles.songs)
summary(the.beatles.songs)
table(the.beatles.songs$Top.50.Billboard)
table(the.beatles.songs$Top.50.Rolling.Stone)
# Equal-interval discretization: 5 bins for each of the 6 selected numeric columns:
discretized.features <- discretize(the.beatles.songs[, c(3:4, 7:10)],
method = "interval",
breaks = c(5, 5, 5, 5, 5, 5))
summary(discretized.features)
# Binning experiments on the raw Top.50.Billboard ranks with mltools::bin_data().
t <- the.beatles.songs$Top.50.Billboard
summary(t)
# One-time installs kept commented, matching the convention used elsewhere in
# this file (the original ran install.packages() unconditionally on every run):
# install.packages("binr")
# install.packages("mltools")
# Load the packages BEFORE the first bin_data() call (the original called
# bin_data() once before library(mltools), which errors):
library(mltools)
library(binr)
# das[, "wt2"] <- bin_data(das$wt, bins=3, binType = "quantile")
# Quantile binning into 3 equally-populated bins:
t.bin_data <- bin_data(t, bins = 3, binType = "quantile")
t.bin_data
# Explicit bin boundaries; -Inf as the left edge catches values at/below 0:
t.bin_data <- bin_data(t, bins = c(-Inf, 0, 10, 20, 30, 40, 50), binType = "explicit")
t.bin_data
class(t.bin_data)
t.bin_data <- bin_data(t, bins = c(-Inf, 0, 1, 10, 20, 30, 40, 50), binType = "explicit")
t.bin_data
t.bin_data <- bin_data(t, bins = c(0, 1, 10, 20, 30, 40, 50), binType = "explicit")
t.bin_data
# Quantile discretization of column 10 with bnlearn::discretize().
# d.d <- discretize(the.beatles.songs[, 10)], method = "quantile", breaks = 10)
#   ^ the original line above had unbalanced brackets ("[, 10)]") — a syntax
#     error that prevents the whole file from parsing; kept commented for reference.
# [, 10] on a data frame drops to a vector, hence (presumably) the
# as.data.frame() wrapper added in the retries below:
d.d <- discretize(the.beatles.songs[, 10], method = "quantile", breaks = 10)
d.d <- discretize(as.data.frame(the.beatles.songs[, 10]), method = "quantile", breaks = 10)
# d.d <- discretize(as.data.frame(the.beatles.songs[, 100]), method = "quantile", breaks = 10)
#   ^ column index 100 is almost certainly out of bounds (other code in this
#     file indexes at most column 11) — commented out; confirm before restoring.
# Exploratory: very large break counts (may exceed the distinct values present):
d.d <- discretize(as.data.frame(the.beatles.songs[, 10]), method = "quantile", breaks = 100)
d.d <- discretize(as.data.frame(the.beatles.songs[, 10]), method = "quantile", breaks = 200)
# binr::bins(): target 5 bins with at least 9 points each:
t.bins <- bins(t, target.bins = 5, minpts = 9)
t.bins$xtbl
t.bins$binct
# cut_number(): equal-count binning (presumably ggplot2's cut_number — confirm);
# n = 200 likely exceeds the available distinct values — expected to error:
t.cut_number <- cut_number(t, 5)
t.cut_number <- cut_number(t, 2)
t.cut_number <- cut_number(t, 200)
class(t.bins$binct)
View(discretized.features)
?scale
# Clear the whole workspace before the next experiment; everything below
# reloads its data from disk.
rm (list = ls())
the.beatles.songs <- readRDS("The Beatles songs dataset, v3.4.RData")
summary(the.beatles.songs)
# Visual exploration: how shifting, scaling and log-transforming change the
# density shape of selected numeric variables.
plot(density(the.beatles.songs$Covered.by))
plot(density(scale(the.beatles.songs$Covered.by))
)
plot(density(the.beatles.songs$Covered.by - 3))
plot(density(the.beatles.songs$Covered.by + 3))
plot(density(the.beatles.songs$Covered.by - 3))
plot(density(the.beatles.songs$Covered.by - 30))
plot(density(the.beatles.songs$Covered.by + 30))
plot(density(the.beatles.songs$Covered.by))
plot(density(the.beatles.songs$Covered.by + 40))
plot(density(log(the.beatles.songs$Covered.by)))
plot(density(log(the.beatles.songs$Covered.by) + 2))
plot(density(log(the.beatles.songs$Covered.by) - 2))
plot(density(log(the.beatles.songs$Top.50.Billboard)))
plot(density(log(the.beatles.songs$Top.50.Rolling.Stone)))
plot(density(log(the.beatles.songs$Top.50.NME)))
plot(density(log(the.beatles.songs$Top.50.NME) - 2.5))
# Reflect-then-log (51 - rank): rank 1 maps to 50, so better ranks become larger values:
plot(density(log(5 - the.beatles.songs$Top.50.NME)))
plot(density(log(51 - the.beatles.songs$Top.50.NME)))
median(the.beatles.songs$Top.50.NME)
summary(51 - the.beatles.songs$Top.50.NME)
summary(log(51 - the.beatles.songs$Top.50.NME))
View(the.beatles.songs)
# Start clean: set the raw Billboard rank aside, drop it from the dataset,
# and save the reduced dataset as v3.5:
rm(list = ls())
the.beatles.songs <- readRDS("The Beatles songs dataset, v3.4.RData")
top.50.billboard <- the.beatles.songs$Top.50.Billboard
the.beatles.songs$Top.50.Billboard <- NULL
saveRDS(object = the.beatles.songs, file = "The Beatles songs dataset, v3.5.RData")
summary(the.beatles.songs)
# Manual z-score vs scale(): the two densities should coincide.
sd(the.beatles.songs$Top.50.Rolling.Stone)
z.rs <- (the.beatles.songs$Top.50.Rolling.Stone - mean(the.beatles.songs$Top.50.Rolling.Stone)) / sd(the.beatles.songs$Top.50.Rolling.Stone)
plot(density(z.rs))
plot(density(the.beatles.songs$Top.50.Rolling.Stone))
z.rs <- scale(the.beatles.songs$Top.50.Rolling.Stone)
plot(density(z.rs))
# This section reads like an R Markdown chunk (note the knitr option call).
knitr::opts_chunk$set(echo = TRUE)
# install.packages("ggplot2")
library(ggplot2)
# Load the CSV version first (for inspection), then replace it with the
# cleaned v3.2 RData version:
the.beatles.songs <-
read.csv("The Beatles songs dataset, v3.csv", stringsAsFactors = FALSE)
str(the.beatles.songs)
summary(the.beatles.songs)
the.beatles.songs <- readRDS("The Beatles songs dataset, v3.2.RData")
summary(the.beatles.songs)
# Derive the binary target Top.50.BB: "Yes" iff the song has a positive
# Billboard Top 50 rank, then turn it into a factor and save as v3.3:
the.beatles.songs$Top.50.Billboard
the.beatles.songs$Top.50.BB <- "No"
the.beatles.songs$Top.50.BB
the.beatles.songs$Top.50.BB[the.beatles.songs$Top.50.Billboard > 0] <- "Yes"
the.beatles.songs$Top.50.BB
the.beatles.songs$Top.50.BB <- as.factor(the.beatles.songs$Top.50.BB)
head(the.beatles.songs$Top.50.BB)
saveRDS(object = the.beatles.songs, file = "The Beatles songs dataset, v3.3.RData")
# Class distribution (absolute, proportional, rounded):
table(the.beatles.songs$Top.50.BB)
prop.table(table(the.beatles.songs$Top.50.BB))
round(prop.table(table(the.beatles.songs$Top.50.BB)), digits = 2)
# Stratified 80/20 train/test split:
library(caret)
set.seed(444)
# set.seed(333) - results in a different split, and different tree and evaluation metrics
train.data.indices <- createDataPartition(the.beatles.songs$Top.50.BB, p = 0.80, list = FALSE)
train.data <- the.beatles.songs[train.data.indices, ]
test.data <- the.beatles.songs[-train.data.indices, ]
library(rpart)
# Tree 1 (rebuilt): predict Top.50.BB from three predictors with default rpart settings.
top.50.tree.1 <- rpart(Top.50.BB ~ Single.certification + Covered.by + Year,
data = train.data,
method = "class")
print(top.50.tree.1)
library(rpart)
library(rattle)
library(rpart.plot)
library(RColorBrewer)
fancyRpartPlot(top.50.tree.1)
top.50.predictions.1 <- predict(top.50.tree.1, newdata = test.data, type = "class")
top.50.predictions.1[1:20]
top.50.predictions.1.dataframe <- data.frame(Song = test.data$Title,
Top.50.Billboard = test.data$Top.50.Billboard,
Top.50.BB = test.data$Top.50.BB,
Prediction = top.50.predictions.1)
# Confusion matrix and evaluation metrics for tree 1:
cm.1 <- table(True = test.data$Top.50.BB, Predicted = top.50.predictions.1)
cm.1
# alternatively:
# cm.1 <- table(True = top.50.predictions.1.dataframe$Top.50.BB,
#               Predicted = top.50.predictions.1.dataframe$Prediction)
# cm.1
source("Evaluation metrics.R")
eval.1 <- getEvaluationMetrics(cm.1)
eval.1
# Tree 2 (rebuilt): fewer/different predictors (Duration + Covered.by):
top.50.tree.2 <- rpart(Top.50.BB ~ Duration + Covered.by,
data = train.data,
method = "class")
# top.50.tree.2 <- rpart(Top.50.BB ~ .,       # use almost all variables, excluding some specific ones
#                        data = subset(train.data, select = -c(Title, Top.50.Billboard)),
#                        method = "class")
print(top.50.tree.2)
fancyRpartPlot(top.50.tree.2)
top.50.predictions.2 <- predict(top.50.tree.2, newdata = test.data, type = "class")
top.50.predictions.2[1:20]
top.50.predictions.2.dataframe <- data.frame(Song = test.data$Title,
Top.50.Billboard = test.data$Top.50.Billboard,
Top.50.BB = test.data$Top.50.BB,
Prediction = top.50.predictions.2)
cm.2 <- table(True = test.data$Top.50.BB, Predicted = top.50.predictions.2)
cm.2
eval.2 <- getEvaluationMetrics(cm.2)
eval.2
# Tree 3 (rebuilt): looser stopping rules (minsplit = 10, cp = 0.001) for a larger tree:
top.50.tree.3 <- rpart(Top.50.BB ~ Single.certification + Covered.by + Year,
data = train.data,
method = "class",
control = rpart.control(minsplit = 10, cp = 0.001))
print(top.50.tree.3)
fancyRpartPlot(top.50.tree.3)
top.50.predictions.3 <- predict(top.50.tree.3, newdata = test.data, type = "class")
top.50.predictions.3[1:20]
top.50.predictions.3.dataframe <- data.frame(Song = test.data$Title,
Top.50.Billboard = test.data$Top.50.Billboard,
Top.50.BB = test.data$Top.50.BB,
Prediction = top.50.predictions.3)
cm.3 <- table(True = test.data$Top.50.BB, Predicted = top.50.predictions.3)
cm.3
eval.3 <- getEvaluationMetrics(cm.3)
eval.3
# Side-by-side comparison of tree 1 and tree 3:
data.frame(rbind(eval.1, eval.3),
row.names = c("top.50.tree.1", "top.50.tree.3"))
library(e1071)
library(caret)
# 10-fold cross-validation over a grid of cp values to pick the pruning threshold:
folds = trainControl(method = "cv", number = 10)               # 10-fold cross-validation
cpGrid = expand.grid(.cp = seq(from = 0.001, to = 0.05, by = 0.001))
set.seed(11)
# NOTE(review): the train() result is not assigned; cp = 0.013 below was read
# off the console output by hand.
train(Top.50.BB ~ Single.certification + Covered.by + Year,
data = train.data,
method = "rpart",
control = rpart.control(minsplit = 10),
trControl = folds, tuneGrid = cpGrid)
# Prune tree 3 at the selected cp, then evaluate the pruned tree:
top.50.tree.5 <- prune(top.50.tree.3, cp = 0.013)               # cp value found in the previous step (train())
print(top.50.tree.5)
fancyRpartPlot(top.50.tree.5)
top.50.predictions.5 <- predict(top.50.tree.5, newdata = test.data, type = "class")
top.50.predictions.5[1:20]
top.50.predictions.5.dataframe <- data.frame(Song = test.data$Title,
Top.50.Billboard = test.data$Top.50.Billboard,
Top.50.BB = test.data$Top.50.BB,
Prediction = top.50.predictions.5)
cm.5 <- table(True = test.data$Top.50.BB, Predicted = top.50.predictions.5)
cm.5
eval.5 <- getEvaluationMetrics(cm.5)
eval.5
# Final comparison of all three trees:
data.frame(rbind(eval.1, eval.3, eval.5),
row.names = c("top.50.tree.1", "top.50.tree.3", "top.50.tree.5"))
# KNN preparation (rebuilt): reload v3.4, set the raw Billboard rank aside,
# save the reduced dataset as v3.5, rescale numeric variables, and make a
# stratified 80/20 train/test split.
the.beatles.songs <- readRDS("The Beatles songs dataset, v3.4.RData")
top.50.billboard <- the.beatles.songs$Top.50.Billboard
the.beatles.songs$Top.50.Billboard <- NULL
saveRDS(object = the.beatles.songs, file = "The Beatles songs dataset, v3.5.RData")
summary(the.beatles.songs)
plot(density(the.beatles.songs$Covered.by))
# ...                                  # check the distributions of other variables as well
source("Rescale numeric variables.R")
# NOTE(review): an earlier section calls RescaleNumericVariables() (capital R) —
# confirm which spelling the sourced script defines.
the.beatles.songs.rescaled <- rescaleNumericVariables(the.beatles.songs)
library(caret)
set.seed(444)
# set.seed(333) - results in a different split, and different results and eval. metrics
train.data.indices <- createDataPartition(the.beatles.songs.rescaled$Top.50.BB, p = 0.80, list = FALSE)
train.data <- the.beatles.songs.rescaled[train.data.indices, ]
test.data <- the.beatles.songs.rescaled[-train.data.indices, ]
library(class)
# knn() classifies the test set directly — its return value IS the predictions:
top.50.knn.1 <- knn(train = train.data[, -c(1, 10)],    # eliminate Title (non-numeric) and output/class (Top.50.BB)
test = test.data[, -c(1, 10)],      # eliminate Title (non-numeric) and output/class (Top.50.BB)
cl = train.data$Top.50.BB,          # output (class) variable
k = 5)                              # k = 5: random value to start with
head(top.50.knn.1)                                      # these are already the predictions, i.e. no predict() etc.
# Indices of misclassified test observations:
which(test.data$Top.50.BB != top.50.knn.1)
knn.cm.1 <- table(True = test.data$Top.50.BB, Predicted = top.50.knn.1)
knn.cm.1
source("Evaluation metrics.R")
eval.knn.1 <- getEvaluationMetrics(knn.cm.1)
eval.knn.1
library(e1071)
library(caret)
# Cross-validate k over the odd values 3, 5, ..., 25:
knn.folds = trainControl(method = "cv", number = 10)            # 10-fold cross-validation
knn.cpGrid = expand.grid(.k = seq(from = 3, to = 25, by = 2))
set.seed(11)
# The formula drops Title from the predictors; result kept for inspection/plotting:
knn.cv <- train(Top.50.BB ~ . - Title,
data = train.data,
method = "knn",
trControl = knn.folds, tuneGrid = knn.cpGrid)
knn.cv
plot(knn.cv)
# Second knn model with a different k, for comparison:
top.50.knn.2 <- knn(train = train.data[, -c(1, 10)],    # eliminate Title (non-numeric) and output/class (Top.50.BB)
test = test.data[, -c(1, 10)],      # eliminate Title (non-numeric) and output/class (Top.50.BB)
cl = train.data$Top.50.BB,          # output (class) variable
k = 7)                              # k = 7: another random value to test and compare
knn.cm.2 <- table(True = test.data$Top.50.BB, Predicted = top.50.knn.2)
knn.cm.2
eval.knn.2 <- getEvaluationMetrics(knn.cm.2)
eval.knn.2
# Compare the two knn models:
data.frame(rbind(eval.knn.1, eval.knn.2),
row.names = c("eval.knn.1", "eval.knn.2"))
# Naive Bayes preparation: discretize the numeric features (none are normally
# distributed per the Shapiro tests) into 5 equal-interval bins each.
the.beatles.songs <- readRDS("The Beatles songs dataset, v3.5.RData")
str(the.beatles.songs)
apply(the.beatles.songs[, c(3:4, 7:9)], MARGIN = 2, FUN = shapiro.test)
library(bnlearn)
discretized.features <- discretize(the.beatles.songs[, c(3:4, 7:9)],
method = "interval",
breaks = c(5, 5, 5, 5, 5))
summary(discretized.features)
# Recombine discretized features with the untouched columns, restoring the
# original column order:
the.beatles.songs.nb <- cbind(the.beatles.songs[, c(1, 2, 5, 6, 10)], discretized.features)
the.beatles.songs.nb <- the.beatles.songs.nb[, names(the.beatles.songs)]
# Stratified 80/20 split of the discretized dataset:
library(caret)
set.seed(4455)
# set.seed(333) - results in a different split, and different results and eval. metrics
train.data.indices <- createDataPartition(the.beatles.songs.nb$Top.50.BB, p = 0.80, list = FALSE)
train.data <- the.beatles.songs.nb[train.data.indices, ]
test.data <- the.beatles.songs.nb[-train.data.indices, ]
library(e1071)
# NB model 1: all predictors except Title (column 1 excluded via train.data[, -1]):
top.50.nb.1 <- naiveBayes(Top.50.BB ~ .,
data = train.data[, -1])
print(top.50.nb.1)
top.50.nb.predictions.1 <- predict(top.50.nb.1, newdata = test.data[, -1], type = "class")
top.50.nb.predictions.1[1:20]
top.50.nb.predictions.1.dataframe <- data.frame(Song = test.data$Title,
Top.50.BB = test.data$Top.50.BB,
Prediction = top.50.nb.predictions.1)
nb.cm.1 <- table(True = test.data$Top.50.BB, Predicted = top.50.nb.predictions.1)
nb.cm.1
source("Evaluation metrics.R")
eval.nb.1 <- getEvaluationMetrics(nb.cm.1)
eval.nb.1
library(e1071)
# NB model 2: a reduced predictor set (Year + Duration + Other.releases):
top.50.nb.2 <- naiveBayes(Top.50.BB ~ Year + Duration + Other.releases,
data = train.data[, -1])
print(top.50.nb.2)
top.50.nb.predictions.2 <- predict(top.50.nb.2, newdata = test.data[, -1], type = "class")
top.50.nb.predictions.2[1:20]
top.50.nb.predictions.2.dataframe <- data.frame(Song = test.data$Title,
Top.50.BB = test.data$Top.50.BB,
Prediction = top.50.nb.predictions.2)
nb.cm.2 <- table(True = test.data$Top.50.BB, Predicted = top.50.nb.predictions.2)
nb.cm.2
source("Evaluation metrics.R")
eval.nb.2 <- getEvaluationMetrics(nb.cm.2)
eval.nb.2
# Compare the two NB models:
data.frame(rbind(eval.nb.1, eval.nb.2),
row.names = c("eval.nb.1", "eval.nb.2"))
library(e1071)
# NB model 3: same predictors as model 2, but predictions are requested as
# class probabilities (type = "raw") to drive ROC analysis.
top.50.nb.3 <- naiveBayes(Top.50.BB ~ Year + Duration + Other.releases,      # can be the same as for top.50.nb.2
data = train.data[, -1])
top.50.nb.predictions.3 <- predict(top.50.nb.3, newdata = test.data[, -1], type = "raw")
top.50.nb.predictions.3[1:20, ]
# Columns 1 and 2 of the raw predictions are the "No"/"Yes" probabilities:
top.50.nb.predictions.3.dataframe <- data.frame(Song = test.data$Title,
Top.50.BB = test.data$Top.50.BB,
Prediction.probability.No = top.50.nb.predictions.3[, 1],
Prediction.probability.Yes = top.50.nb.predictions.3[, 2])
library(pROC)
# ROC curve over the "Yes" probabilities; AUC summarizes ranking quality:
top.50.nb.predictions.3.roc <-
roc(response = test.data$Top.50.BB,
predictor = top.50.nb.predictions.3[, 2])
top.50.nb.predictions.3.roc$auc
# Plot with the best probability threshold per Youden's J statistic:
plot.roc(top.50.nb.predictions.3.roc,
print.thres = TRUE,
print.thres.best.method = "youden")
# Accuracy/specificity/sensitivity/threshold at each local maximum of the ROC curve:
top.50.nb.predictions.3.coords <-
coords(top.50.nb.predictions.3.roc,
ret = c("accuracy", "spec", "sens", "thr"),
x = "local maximas")
top.50.nb.predictions.3.coords
?rpart.control
